library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
theme_set(theme_minimal()) 

# social vulnerability data
CDC_SVI_data <- read.csv('data/2018_CDC_SVI.csv')
# -999 means missing data in this data set
CDC_SVI_data <- CDC_SVI_data %>% filter(E_TOTPOP != 0 & E_CROWD != -999 & E_POV != -999 && E_HU != -999 & EP_POV != -999 & EP_CROWD != -999)

# Urban and rural county classification data from: https://www.cdc.gov/nchs/data_access/urban_rural.htm

urban_rural_data <- read.csv('data/NCHSURCodes2013.csv')
urban_rural_data <- urban_rural_data %>% mutate(FIPS = ï..FIPS.code)

# join urban/rural county designations with the CDC data
CDC_SVI_data <- left_join(CDC_SVI_data , urban_rural_data, by = 'FIPS')

# group crowding counts, household counts, and population counts by state
CDC_SVI_state <- CDC_SVI_data %>% group_by(ST_ABBR) %>% summarise(crowd = sum(E_CROWD), households = sum(E_HU), pop = sum(E_TOTPOP), poverty = sum(E_POV))
## `summarise()` ungrouping output (override with `.groups` argument)
CDC_SVI_state
## # A tibble: 52 x 5
##    ST_ABBR   crowd households      pop poverty
##    <chr>     <int>      <int>    <int>   <int>
##  1 AK        16847     315386   738516   77865
##  2 AL        32038    2244462  4864680  829400
##  3 AR        30795    1362040  2990671  510337
##  4 AZ       113062    2970935  6946685 1092192
##  5 CA      1065030   14084824 39148760 5487141
##  6 CO        55538    2352202  5531141  590504
##  7 CT        25400    1512305  3581504  348449
##  8 DC        10155     311545   684498  109497
##  9 DE         5900     428251   949495  109798
## 10 FL       229353    9348689 20598139 2983851
## # ... with 42 more rows
# Point in time counts of homeless individuals for each individual 
# shelter that performs a count
homeless_data <- read.csv("data/2007-2019-PIT-Counts-by-CoC.csv")

# get the state abbreviation from the shelter id, rename the 2019 count column
homeless_data <- homeless_data %>% mutate('ST_ABBR' = substring(homeless_data$CoC.Number,0,2)) %>% mutate(homeless_count = parse_number(Overall.Homeless..2019)) 

# Sum the homeless counts by state
homeless_data <- homeless_data %>% select(ST_ABBR, homeless_count) %>% group_by(ST_ABBR) %>% summarise(homeless_counts = sum(homeless_count))
## `summarise()` ungrouping output (override with `.groups` argument)
# join homeless counts with CDC data
CDC_SVI_state <- left_join(CDC_SVI_state, homeless_data, by = 'ST_ABBR') 

# get crowding and homeless counts as proportions
CDC_SVI_state <- CDC_SVI_state %>% mutate('crowd_percent' = crowd/households * 100) %>% mutate('pov_percentage' = poverty/pop * 100) 

CDC_SVI_state <- CDC_SVI_state %>% mutate('homeless_percent' = homeless_counts/pop * 100)

CDC_SVI_state
## # A tibble: 52 x 9
##    ST_ABBR  crowd households    pop poverty homeless_counts crowd_percent
##    <chr>    <int>      <int>  <int>   <int>           <dbl>         <dbl>
##  1 AK      1.68e4     315386 7.39e5   77865            1907          5.34
##  2 AL      3.20e4    2244462 4.86e6  829400            3261          1.43
##  3 AR      3.08e4    1362040 2.99e6  510337            2717          2.26
##  4 AZ      1.13e5    2970935 6.95e6 1092192           10007          3.81
##  5 CA      1.07e6   14084824 3.91e7 5487141          151278          7.56
##  6 CO      5.55e4    2352202 5.53e6  590504            9619          2.36
##  7 CT      2.54e4    1512305 3.58e6  348449            3033          1.68
##  8 DC      1.02e4     311545 6.84e5  109497            6521          3.26
##  9 DE      5.90e3     428251 9.49e5  109798             921          1.38
## 10 FL      2.29e5    9348689 2.06e7 2983851           28328          2.45
## # ... with 42 more rows, and 2 more variables: pov_percentage <dbl>,
## #   homeless_percent <dbl>
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
library(htmlwidgets)

CDC_SVI_state <- CDC_SVI_state %>% mutate('State' = ST_ABBR)

# plot homeless & crowding data
homeless_v_crowd <- ggplot(CDC_SVI_state, aes(x=crowd_percent, y=homeless_percent, label = State)) + geom_point() + geom_smooth(method ='lm') + ggtitle("Homeless % vs. Crowding % by US State") +
  xlab("Percentage Crowded Households") + ylab("Percentage Homeless Population") 

ggsave("homelessness_v_crowding.jpg")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
p <- ggplotly(homeless_v_crowd, tooltip = 'State')
## `geom_smooth()` using formula 'y ~ x'
p
htmlwidgets::saveWidget(as_widget(p), "HomelessAndCrowding.html")

CDC_SVI_state %>% select("crowd_percent", "homeless_percent")%>% summary()
##  crowd_percent   homeless_percent 
##  Min.   :1.136   Min.   :0.03962  
##  1st Qu.:1.559   1st Qu.:0.08456  
##  Median :1.929   Median :0.10104  
##  Mean   :2.379   Mean   :0.15574  
##  3rd Qu.:2.540   3rd Qu.:0.14770  
##  Max.   :7.676   Max.   :0.95267
CDC_SVI_data1 <- CDC_SVI_data %>% filter( !is.na(X2013.code))

CDC_SVI_data2 <- CDC_SVI_data1 %>% filter((EP_POV >= 0) & (EP_POV < 35) & (EP_CROWD<8))

poverty_vs_crowd2 <- ggplot(CDC_SVI_data2, aes(x=EP_CROWD, y=EP_POV,color=as.character(X2013.code), alpha = 0.1)) + geom_point() + geom_smooth(method ='lm',  colour="black")+ theme(legend.position = "none") + facet_wrap(~X2013.code) + ggtitle("Poverty vs. Crowding by US County, by Urban (1) to Rural (6)") +xlab("Percentage Crowded Living Conditions") + ylab("Percentage In Poverty") 

poverty_vs_crowd2
## `geom_smooth()` using formula 'y ~ x'

ggsave("poverty_v_crowding.jpg")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
CDC_SVI_data <- CDC_SVI_data %>% filter(EP_CROWD < 10 & !is.na(X2013.code)) 

plot3 <- ggplot(CDC_SVI_data, aes(x = as.character(X2013.code), y = EP_CROWD, color =as.character(X2013.code))) + geom_boxplot() + theme(legend.position = "none") + ggtitle("Crowded Living Conditions in US Counties") +
  xlab("Urban (1) to Rural (6)") + ylab("Percentage in Crowded Living Conditions") 

plot3

ggsave("crowding_boxplot.jpg")
## Saving 7 x 5 in image
CDC_SVI_data %>% select("EP_CROWD", "EP_POV")%>% summary()
##     EP_CROWD         EP_POV     
##  Min.   :0.000   Min.   : 2.30  
##  1st Qu.:1.200   1st Qu.:10.90  
##  Median :1.900   Median :14.70  
##  Mean   :2.249   Mean   :15.46  
##  3rd Qu.:2.800   3rd Qu.:18.90  
##  Max.   :9.800   Max.   :49.70

http://rmarkdown.rstudio.com